# -*- coding: utf-8 -*-
'''
@author: Hugo
@file: 测试.py
@time: 2021/9/8 18:08
'''
import re
from icecream import ic
from scrapy import Selector

# Smallest possible document: select the text node of <title> with XPath.
body = '<html><head><title>Hello World</title></head><body></body></html>'
_title_text_xpath = '//title/text()'
selector = Selector(text=body)
标题 = selector.xpath(_title_text_xpath)
# ic(标题)

# extract()       returns the results for all matched elements
# extract_first() returns the result for the first matched element
# ic(标题.extract())
# ic(标题.extract_first())
# Sample page: a <div id='images'> holding five <a> links; each link contains
# a "Name: ..." caption, a <br /> and a thumbnail <img>.
html = '''
   <html>
    <head>
     <base href='http://example.com/' />
     <title>Example website</title>
    </head>
    <body>
     <div id='images'>
      <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
      <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
      <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>   
      <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>   
      <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
     </div>
    </body>
   </html>
   '''
网页 = Selector(text=html)

# ---- XPath queries ----
a标签 = 网页.xpath('//a')
# ic(a标签)
# ic(a标签.xpath('./img'))
# ic(a标签.extract())
a标签文本 = 网页.xpath('//a/text()').extract()
# 第一个a标签文本 = 网页.xpath('//a[@href="image1.html"]/text()').extract_first()
# ic(re.search(r'Name: (.+1?)', 第一个a标签文本).group(1))

# ---- CSS queries ----
res = 网页.css('a')
# A CSS query can extract the matched element together with its tags.
# The selectors below all start from the link whose href is image1.html.
_first_link_css = 'a[href="image1.html"]'
_first_link_img_css = _first_link_css + ' img'
res1 = 网页.css(_first_link_css).extract()
res2 = 网页.css(_first_link_css).extract_first()
res3 = 网页.css(_first_link_img_css).extract_first()
res4 = 网页.css(_first_link_css + '::text').extract_first()
res5 = 网页.css(_first_link_img_css + '::attr("src")').extract_first()
# ic(res1)
# ic(res2)
# ic(res3)
# ic(res4)
# ic(res5)

# Regex extraction with .re() on the nodes picked by an XPath query.
# res6 and res7 run against the text nodes of every <a>; res8 runs against
# the node selected by '.' and anchors its pattern on a literal '<br>'.
_link_text_nodes = 网页.xpath('//a/text()')
res6 = _link_text_nodes.re('Name: (.*1?)')
res7 = _link_text_nodes.re('(.*?): (.*)')
res8 = 网页.xpath('.').re('Name: (.*)<br>')

# Show the three results (the first two through ic, the last through print).
ic(res6)
ic(res7)
print(res8)

